We are trying to understand if some combination of odds predicts over/under values well. First, we read the matches and odd_details datasets.
## Classes 'data.table' and 'data.frame': 3129 obs. of 10 variables:
## $ matchId : chr "KjF6FiA6" "ILVbJgQm" "SGIEDVvJ" "YwL5xFHJ" ...
## $ home : chr "tottenham" "aston villa" "wolves" "bolton" ...
## $ away : chr "manchester city" "west ham" "stoke city" "fulham" ...
## $ homescore: num 0 3 2 0 0 2 1 6 1 3 ...
## $ awayscore: num 0 0 1 0 4 2 0 0 1 0 ...
## $ isover : logi FALSE TRUE TRUE FALSE TRUE TRUE ...
## $ is1 : logi FALSE TRUE TRUE FALSE FALSE FALSE ...
## $ is2 : logi FALSE FALSE FALSE FALSE TRUE FALSE ...
## $ isX : logi TRUE FALSE FALSE TRUE FALSE TRUE ...
## $ isbts : logi FALSE FALSE TRUE FALSE FALSE TRUE ...
## - attr(*, ".internal.selfref")=<externalptr>
## Classes 'data.table' and 'data.frame': 5675665 obs. of 7 variables:
## $ matchId : chr "004f4ING" "004f4ING" "004f4ING" "004f4ING" ...
## $ betType : chr "1x2" "1x2" "1x2" "1x2" ...
## $ oddtype : chr "odd1" "odd1" "odd1" "odd1" ...
## $ bookmaker : chr "10Bet" "10Bet" "12BET" "12BET" ...
## $ date : num 1.42e+09 1.42e+09 1.42e+09 1.42e+09 1.42e+09 ...
## $ odd : num 1.67 1.65 1.67 1.65 1.7 1.67 1.68 1.61 1.62 1.67 ...
## $ totalhandicap: chr NA NA NA NA ...
## - attr(*, ".internal.selfref")=<externalptr>
We will need to transform the matches data set into a format which shows us actual over/under outcomes: (and 1x2 outcomes, for Part b)
# drop unplayed games, split scores, define columns showing
# outcomes (over/under, home win, away win, draw, both-teams-to-score)
# NOTE(review): `:=` modifies `matches` by reference; na.omit() then drops
# rows whose score could not be parsed (unplayed games). isover compares
# total goals against the global handicaplvl threshold (2.5).
matches <- na.omit(matches[, `:=`(c("homescore", "awayscore"),
tstrsplit(score, ":"))][, `:=`(c("homescore", "awayscore"),
list(as.numeric(homescore), as.numeric(awayscore)))][, `:=`(c("date",
"leagueId", "type", "score"), NULL)][, `:=`(c("isover", "is1",
"is2", "isX", "isbts"), list(homescore + awayscore > handicaplvl,
homescore > awayscore, homescore < awayscore, homescore ==
awayscore, homescore > 0 & awayscore > 0))])
The odd_details data set contains the odds we will use to perform PCA - however there are some non-unique odd types. For example, both the “over 2.5” and “over 0.5” odds have the “over” odd type. To make these unique, we will concatenate them with the handicap levels:
# elaborate oddtypes for an easier wide table construction:
# make oddtype unique by folding in betType ("ha"), the handicap level
# ("ou", e.g. over2.5), or both ("ah"); then keep only the latest-dated
# odd per (matchId, bookmaker, oddtype) and drop the now-redundant columns.
odd_details_latest <- (copy(odd_details)[betType == "ha", `:=`(oddtype,
paste(betType, oddtype, sep = ""))][betType == "ou", `:=`(oddtype,
paste(oddtype, totalhandicap, sep = ""))][betType == "ah",
`:=`(oddtype, paste(betType, oddtype, totalhandicap, sep = ""))][,
`:=`(maxdate, max(date)), by = list(matchId, bookmaker, oddtype)][date ==
maxdate][, `:=`(c("maxdate", "date", "betType", "totalhandicap"),
NULL)])
# put into wide table format, each odd type is a feature
# (one row per matchId x bookmaker, one column per elaborated oddtype)
odd_details_wide <- dcast(copy(odd_details_latest), matchId +
bookmaker ~ oddtype, value.var = "odd")
Let’s view a small part of our wide table:
## Classes 'data.table' and 'data.frame': 10 obs. of 10 variables:
## $ matchId : chr "004f4ING" "004f4ING" "004f4ING" "004f4ING" ...
## $ bookmaker: chr "10Bet" "12BET" "188BET" "1xBet" ...
## $ 12 : num 1.22 NA NA 1.28 1.25 1.25 1.25 1.24 1.27 1.22
## $ 1X : num 1.12 NA NA 1.15 1.14 1.12 1.12 1.12 1.12 1.12
## $ NO : num 1.69 NA NA 1.72 1.78 NA 1.72 1.72 1.65 1.67
## $ X2 : num 2.15 NA NA 2.4 2.25 2.2 2.26 2.25 2.3 2.2
## $ YES : num 2.05 NA NA 2.12 2.05 NA 2.02 2.02 2.1 2.05
## $ ah1+0.5 : num NA NA NA NA NA NA NA NA NA NA
## $ ah1+1 : num NA NA NA 1.04 NA NA NA NA NA NA
## $ ah1+1.5 : num NA NA NA 1.02 NA NA NA NA NA NA
## - attr(*, "sorted")= chr "matchId" "bookmaker"
## - attr(*, ".internal.selfref")=<externalptr>
## matchId bookmaker 12 1X NO X2 YES ah1+0.5 ah1+1 ah1+1.5
## 1: 004f4ING 10Bet 1.22 1.12 1.69 2.15 2.05 NA NA NA
## 2: 004f4ING 12BET NA NA NA NA NA NA NA NA
## 3: 004f4ING 188BET NA NA NA NA NA NA NA NA
## 4: 004f4ING 1xBet 1.28 1.15 1.72 2.40 2.12 NA 1.04 1.02
## 5: 004f4ING BetVictor 1.25 1.14 1.78 2.25 2.05 NA NA NA
## 6: 004f4ING Betclic 1.25 1.12 NA 2.20 NA NA NA NA
## 7: 004f4ING Betsafe 1.25 1.12 1.72 2.26 2.02 NA NA NA
## 8: 004f4ING Betsson 1.24 1.12 1.72 2.25 2.02 NA NA NA
## 9: 004f4ING Betway 1.27 1.12 1.65 2.30 2.10 NA NA NA
## 10: 004f4ING ComeOn 1.22 1.12 1.67 2.20 2.05 NA NA NA
AH (Asian Handicap) data introduces a lot of missing values, we might consider removing them and then filtering other columns which introduce missing values:
# Remove Asian-Handicap columns (they introduce many NAs).
# FIX(review): added copy() so the `:=`-with-NULL column deletion does not
# mutate odd_details_wide by reference; this matches the appendix version.
odd_details_wide_noah <- copy(odd_details_wide)[, `:=`(c(grep("ah",
    colnames(odd_details_wide), fixed = TRUE)), NULL)]
# count NAs per bookmaker for every odd column, plus a row total
# NOTE(review): rowSums over .SD[, 3:length(.SD)] skips .SD column 2 (the
# first odd column) -- confirm this offset is intentional.
na_count_by_bm <- copy(odd_details_wide_noah)[, lapply(.SD, function(x) sum(is.na(x))),
    by = bookmaker][, `:=`(total, rowSums(.SD[, 3:length(.SD)]))]
# keep the 5 best bookmakers: sort by NA total descending, take the tail
# (i.e. the five bookmakers with the fewest missing odds)
bad_bookmakers <- na_count_by_bm[, total, by = bookmaker][order(-total)]
keep_bookmakers <- bad_bookmakers$bookmaker[(length(bad_bookmakers$bookmaker) -
    4):length(bad_bookmakers$bookmaker)]
odd_details_wide_noah <- odd_details_wide_noah[bookmaker %in%
    keep_bookmakers]
# function to remove columns with many NAs
# Recursively drops, per round, the odd-type column(s) contributing the most
# NAs for some bookmaker, until either few odd columns remain or the last
# round's saving was small relative to what deleting a match row could save.
# wide_table: data.table with matchId, bookmaker and one column per odd type.
# kgf: "keep going factor" carried over from the previous round (the initial
#      call passes a large value, e.g. 100, so at least one round runs).
# Returns the filtered wide table.
filter.oddtypes <- function(wide_table, kgf) {
    # FIX(review): `||` (scalar, short-circuiting) instead of `|`; both
    # operands here are length-1 conditions.
    if ((length(wide_table) < 11) || kgf < 20) {
        return(wide_table)
    }
    # NA count per bookmaker for each odd column, plus a per-row total
    na_count <- copy(wide_table)[, lapply(.SD, function(x) sum(is.na(x))),
        by = bookmaker][, `:=`(total, rowSums(.SD[, 3:length(.SD)]))]
    sum1 <- sum(na_count$total)
    # for each bookmaker, the column holding the most NAs (first on ties);
    # unique() collapses these into the set of columns to drop this round
    bad_oddtypes <- unique(na_count[, colnames(.SD[, 3:(length(.SD) -
        1)])[max.col(na_count[, 3:(length(.SD) - 1)], ties.method = "first")]])
    print(bad_oddtypes)
    # recompute totals without the dropped columns to measure the saving
    na_count <- na_count[, `:=`(c(bad_oddtypes), NULL)][, `:=`(total,
        rowSums(.SD[, 3:(length(.SD) - 1)]))]
    sum2 <- sum(na_count$total)
    removed <- sum1 - sum2
    # upper bound on NAs removable by instead deleting one match's rows
    rowremposs <- max(wide_table[, .N, by = matchId]$N) * (length(wide_table) -
        2)
    print(paste("The removed number of NA entries is", removed,
        "deleting a match instead could remove a maximum of",
        rowremposs, "NA values."))
    keepgoingfactor <- removed/rowremposs
    wide_table <- wide_table[, `:=`(c(bad_oddtypes), NULL)]
    return(filter.oddtypes(wide_table, keepgoingfactor))
}
odd_details_wide_filtered <- filter.oddtypes(copy(odd_details_wide_noah),
    100)
Functions to do PCA and MDS analyses:
# Run PCA on a single bookmaker's odds and plot the first two components,
# colouring points by the logical outcome column named in `condition`.
# wide_filtered_table: matchId + bookmaker + one column per odd type.
# match.tbl: matches table containing the outcome columns (isover, is1, ...).
# bmname: bookmaker to analyse; condition: outcome column name (string).
# suppress: if TRUE, skip the scree plot and printed summaries.
# Returns list(scores merged with the outcome column, the princomp object).
pca.analysis <- function(wide_filtered_table, match.tbl, bmname,
condition, suppress = FALSE) {
# one row per match for this bookmaker; drop rows with any NA odds
df <- as.data.frame(na.omit(copy(wide_filtered_table)[bookmaker ==
bmname][, `:=`(bookmaker, NULL)]))
# scale each odd column by its maximum, without centering
df[, -1] <- lapply(df[, -1], function(x) scale(x, center = FALSE,
scale = max(x)))
rownames(df) <- df[["matchId"]]
pca <- princomp(df[, -1])
if (!suppress) {
plot(pca, main = paste("PCA for", bmname))
print(paste("PCA for", bmname))
print(summary(pca))
print(pca$loadings)
}
# carry matchIds alongside the PC scores, then join the outcome column
# (`..condition` looks the column name up in the calling scope)
mscores <- as.data.frame(pca$scores)
mscores[["matchId"]] <- rownames(mscores)
pca_m <- unique(merge(mscores, match.tbl[, c("matchId", ..condition)],
by = "matchId"))
plot(pca_m[["Comp.1"]], pca_m[["Comp.2"]], col = ifelse(pca_m[[condition]],
"red", "black"), xlab = "Comp. 1", ylab = "Comp. 2",
main = paste("PCA shifted data for", bmname, "and", condition))
legend("bottomright", c(condition, paste("NOT", condition)),
fill = c("red", "black"), cex = 0.75)
return(list(pca_m, pca))
}
# Classical (metric) MDS on a single bookmaker's odds, plotted in 2D with
# points coloured by the logical outcome column named in `condition`.
# mthd: distance metric passed to dist(), e.g. "euclidean" or "manhattan".
# Other arguments are as in pca.analysis(). Called for its plot side effect.
do.MDS <- function(wide_filtered_table, match.tbl, mthd, bmname,
condition) {
# one row per match for this bookmaker; drop rows with any NA odds
df <- as.data.frame(na.omit(copy(wide_filtered_table)[bookmaker ==
bmname][, `:=`(bookmaker, NULL)]))
# scale each odd column by its maximum, without centering
df[, -1] <- lapply(df[, -1], function(x) scale(x, center = FALSE,
scale = max(x)))
rownames(df) <- df[["matchId"]]
d <- dist(df[, -1], method = mthd)
# cmdscale() returns 2 coordinates per match (columns V1, V2)
mds <- as.data.frame(cmdscale(d))
mds[["matchId"]] <- rownames(mds)
mds_m <- unique(merge(mds, match.tbl[, c("matchId", ..condition)],
by = "matchId"))
plot(mds_m[["V1"]], mds_m[["V2"]], main = paste(bmname, mthd,
"Distance MDS for", condition), xlab = "Dim1", ylab = "Dim2",
col = ifelse(mds_m[[condition]], "red", "black"))
legend("bottomright", c(condition, paste("NOT", condition)),
fill = c("red", "black"), cex = 0.75)
invisible(NULL)
}
Do the PCA analysis for 1xBet, one of our selected bookmakers
# PCA for 1xBet against the over/under outcome (prints summary and plots)
pca1xB_ou <- pca.analysis(odd_details_wide_filtered, matches,
"1xBet", "isover")
## [1] "PCA for 1xBet"
## Importance of components:
## Comp.1 Comp.2 Comp.3 Comp.4
## Standard deviation 0.2591188 0.05757021 0.019418429 0.013205701
## Proportion of Variance 0.9425271 0.04652556 0.005293272 0.002448043
## Cumulative Proportion 0.9425271 0.98905263 0.994345906 0.996793948
## Comp.5 Comp.6 Comp.7 Comp.8
## Standard deviation 0.009926890 0.0079534815 0.0068718105 0.0044006633
## Proportion of Variance 0.001383319 0.0008879951 0.0006628853 0.0002718519
## Cumulative Proportion 0.998177268 0.9990652629 0.9997281481 1.0000000000
##
## Loadings:
## Comp.1 Comp.2 Comp.3 Comp.4 Comp.5 Comp.6 Comp.7 Comp.8
## over1.5 0.228 0.457 0.318 0.221 0.740 0.185
## over2.5 0.405 0.155 -0.581 0.570 -0.126 -0.358
## over3.5 0.460 0.360 0.352 -0.170 -0.695 -0.127
## over4.5 0.512 0.406 -0.536 0.497 0.126 0.103
## under1.5 -0.341 0.470 -0.452 -0.537 -0.131 0.161 0.356
## under2.5 -0.318 0.458 0.303 0.134 -0.756
## under3.5 -0.254 0.410 0.115 0.238 0.194 0.452 -0.398 0.545
## under4.5 -0.167 0.274 0.259 0.152 0.123 0.352 -0.811
##
## Comp.1 Comp.2 Comp.3 Comp.4 Comp.5 Comp.6 Comp.7 Comp.8
## SS loadings 1.000 1.000 1.000 1.000 1.000 1.000 1.000 1.000
## Proportion Var 0.125 0.125 0.125 0.125 0.125 0.125 0.125 0.125
## Cumulative Var 0.125 0.250 0.375 0.500 0.625 0.750 0.875 1.000
The PCA analysis for the rest of the bookmakers looks like this:
## [1] "PCA for ComeOn"
## Importance of components:
## Comp.1 Comp.2 Comp.3 Comp.4
## Standard deviation 0.2973914 0.07296216 0.013761519 0.0090895851
## Proportion of Variance 0.9389338 0.05651627 0.002010531 0.0008771345
## Cumulative Proportion 0.9389338 0.99545003 0.997460566 0.9983377005
## Comp.5 Comp.6 Comp.7 Comp.8
## Standard deviation 0.0082676987 0.0073477169 0.0044355514 0.0038157904
## Proportion of Variance 0.0007256837 0.0005731693 0.0002088687 0.0001545778
## Cumulative Proportion 0.9990633842 0.9996365535 0.9998454222 1.0000000000
##
## Loadings:
## Comp.1 Comp.2 Comp.3 Comp.4 Comp.5 Comp.6 Comp.7 Comp.8
## over1.5 0.229 0.160 0.115 0.325 0.275 0.291 0.725 0.341
## over2.5 0.322 0.214 0.780 -0.414 -0.259
## over3.5 0.460 0.364 -0.541 0.471 0.193 -0.299 0.110
## over4.5 0.502 0.420 -0.564 0.306 -0.308 -0.157 -0.199
## under1.5 -0.396 0.484 -0.124 -0.491 -0.463 0.169 0.288 0.166
## under2.5 -0.337 0.427 0.106 0.421 -0.711
## under3.5 -0.275 0.368 0.124 0.501 0.413 -0.526 0.275
## under4.5 -0.183 0.261 0.163 0.211 0.285 0.150 -0.848
##
## Comp.1 Comp.2 Comp.3 Comp.4 Comp.5 Comp.6 Comp.7 Comp.8
## SS loadings 1.000 1.000 1.000 1.000 1.000 1.000 1.000 1.000
## Proportion Var 0.125 0.125 0.125 0.125 0.125 0.125 0.125 0.125
## Cumulative Var 0.125 0.250 0.375 0.500 0.625 0.750 0.875 1.000
## [1] "PCA for youwin"
## Importance of components:
## Comp.1 Comp.2 Comp.3 Comp.4
## Standard deviation 0.2410126 0.05393239 0.01429719 0.010020746
## Proportion of Variance 0.9455020 0.04734588 0.00332724 0.001634493
## Cumulative Proportion 0.9455020 0.99284787 0.99617511 0.997809607
## Comp.5 Comp.6 Comp.7 Comp.8
## Standard deviation 0.0074633453 0.0062020250 0.0050104154 0.0039110492
## Proportion of Variance 0.0009066715 0.0006261089 0.0004086301 0.0002489829
## Cumulative Proportion 0.9987162781 0.9993423870 0.9997510171 1.0000000000
##
## Loadings:
## Comp.1 Comp.2 Comp.3 Comp.4 Comp.5 Comp.6 Comp.7 Comp.8
## over1.5 0.233 0.150 0.394 0.300 0.811 0.118
## over2.5 0.373 0.236 0.408 -0.163 0.450 -0.511 0.319 -0.216
## over3.5 0.445 0.342 0.178 -0.394 -0.641 -0.232 0.182
## over4.5 0.487 0.424 -0.606 0.445 0.117
## under1.5 -0.420 0.546 -0.314 -0.572 0.295
## under2.5 -0.318 0.401 0.121 0.241 -0.435 0.100 0.607 -0.311
## under3.5 -0.254 0.317 0.268 0.400 -0.249 0.727
## under4.5 -0.171 0.256 0.307 0.275 -0.675 -0.528
##
## Comp.1 Comp.2 Comp.3 Comp.4 Comp.5 Comp.6 Comp.7 Comp.8
## SS loadings 1.000 1.000 1.000 1.000 1.000 1.000 1.000 1.000
## Proportion Var 0.125 0.125 0.125 0.125 0.125 0.125 0.125 0.125
## Cumulative Var 0.125 0.250 0.375 0.500 0.625 0.750 0.875 1.000
## [1] "PCA for Betfair"
## Importance of components:
## Comp.1 Comp.2 Comp.3 Comp.4
## Standard deviation 0.2609809 0.06606928 0.024170156 0.009849291
## Proportion of Variance 0.9290363 0.05954076 0.007968455 0.001323199
## Cumulative Proportion 0.9290363 0.98857702 0.996545476 0.997868676
## Comp.5 Comp.6 Comp.7 Comp.8
## Standard deviation 0.0081504617 0.0065419734 0.005326085 0.0043197846
## Proportion of Variance 0.0009061073 0.0005837579 0.000386929 0.0002545302
## Cumulative Proportion 0.9987747828 0.9993585407 0.999745470 1.0000000000
##
## Loadings:
## Comp.1 Comp.2 Comp.3 Comp.4 Comp.5 Comp.6 Comp.7 Comp.8
## over1.5 0.250 0.270 0.449 0.361 0.697 0.137 0.138
## over2.5 0.398 0.195 0.324 0.506 -0.528 -0.316 -0.236
## over3.5 0.503 0.315 0.453 -0.571 0.254 -0.165 0.120
## over4.5 0.441 0.459 -0.766
## under1.5 -0.342 0.518 0.109 -0.240 -0.542 0.350 0.192 0.303
## under2.5 -0.329 0.442 0.111 0.230 -0.166 -0.771
## under3.5 -0.267 0.335 0.375 0.197 -0.295 -0.580 0.457
## under4.5 -0.189 0.259 0.132 0.374 -0.405 0.748 0.124
##
## Comp.1 Comp.2 Comp.3 Comp.4 Comp.5 Comp.6 Comp.7 Comp.8
## SS loadings 1.000 1.000 1.000 1.000 1.000 1.000 1.000 1.000
## Proportion Var 0.125 0.125 0.125 0.125 0.125 0.125 0.125 0.125
## Cumulative Var 0.125 0.250 0.375 0.500 0.625 0.750 0.875 1.000
## [1] "PCA for Betfair Exchange"
## Importance of components:
## Comp.1 Comp.2 Comp.3 Comp.4
## Standard deviation 0.2397279 0.08584109 0.016023518 0.011871509
## Proportion of Variance 0.8779785 0.11257378 0.003922496 0.002153072
## Cumulative Proportion 0.8779785 0.99055229 0.994474786 0.996627858
## Comp.5 Comp.6 Comp.7 Comp.8
## Standard deviation 0.010208065 0.007612929 0.0064106690 0.0041798190
## Proportion of Variance 0.001591965 0.000885422 0.0006278465 0.0002669081
## Cumulative Proportion 0.998219823 0.999105245 0.9997330919 1.0000000000
##
## Loadings:
## Comp.1 Comp.2 Comp.3 Comp.4 Comp.5 Comp.6 Comp.7 Comp.8
## over1.5 0.237 0.111 0.124 0.538 0.734 0.230 0.175
## over2.5 0.392 0.235 0.384 0.548 -0.546 -0.152 -0.147
## over3.5 0.489 0.336 0.453 -0.638 0.147
## over4.5 0.497 0.351 -0.789
## under1.5 -0.337 0.555 -0.685 0.300 0.109
## under2.5 -0.316 0.479 0.185 0.287 -0.619 -0.412
## under3.5 -0.247 0.337 0.509 -0.198 0.724
## under4.5 -0.172 0.211 0.471 -0.114 0.667 -0.491
##
## Comp.1 Comp.2 Comp.3 Comp.4 Comp.5 Comp.6 Comp.7 Comp.8
## SS loadings 1.000 1.000 1.000 1.000 1.000 1.000 1.000 1.000
## Proportion Var 0.125 0.125 0.125 0.125 0.125 0.125 0.125 0.125
## Cumulative Var 0.125 0.250 0.375 0.500 0.625 0.750 0.875 1.000
Over 90% of the total variance is covered by the first principal component for all bookmakers. We can note that each plot of the data on the first two principal components is bow shaped - this indicates that our analysis produced somewhat consistent results. The plots show some small but discernible separation between the red and black dots, although the center of the plot does not seem very decisive for any bookmaker and it would be difficult to use these odds predictively unless they were quite extreme.
In the case of each bookmaker, the eigenvectors of PCA are made up of various over and under odds with handicaps between 1.5 and 4.5. We can surmise that most of the variance in the data comes from such odds, which capture the over/under likelihoods better than they do the 1x2 likelihoods.
We can simply run the do.MDS() function shown above to the same datasets to do a MDS analysis, simply by calling:
# classical MDS for 1xBet with two different distance metrics
do.MDS(odd_details_wide_filtered, matches, "euclidean", "1xBet",
"isover")
do.MDS(odd_details_wide_filtered, matches, "manhattan", "1xBet",
"isover")
The rest of our MDS plots are below:
The main finding is that Euclidean MDS produces bow-shaped plots that are very similar to that of PCA, and there does seem to be some separation between the black and red points. It also can be said that the spread of the points looks higher with Euclidean MDS than in PCA. In Manhattan MDS, possibly because coordinate differences are summed as absolute values rather than squared (which weights the features differently than Euclidean distance does), we see a divergent pattern that makes it hard to make any predictions. It also must be noted that neither of the MDS methods offer anything like PCA eigenvectors, which can be used to understand where the variance in the data comes from. This (and the higher plot spread) makes MDS look worse as a predictive tool than PCA.
In the plots below, “is1”, “is2” and “isX” indicate a home team win, an away team win, and a draw, respectively.
An interesting finding with these plots is that, while the PCA model uses only over and under odds in the eigenvectors, there is some kind of a predictive pattern visible, especially with the “is1” (home team wins) plots. We can furthermore see that a hypothetical data point in the edges of the bow-shaped plot indicates that a draw is not likely. However, it is still difficult to say whether a point in the center of the bow will be a draw.
Image structure and dimensions:
str(image)
## num [1:512, 1:512, 1:3] 0.796 0.796 0.784 0.792 0.816 ...
dim(image)
## [1] 512 512 3
Add jitter to images:
# add uniform noise in [-0.1, 0.1] to every channel
image_noisy <- jitter(image, amount = 0.1)
# ensure intensities stay within 0-1 range
image_noisy[which(image_noisy > 1)] <- 1
image_noisy[which(image_noisy < 0)] <- 0
Convert a smaller, noisy image to greyscale to do PCA analysis:
dim(image_small)
## [1] 256 256 3
# noisy version of the small image, clamped back into [0, 1]
image_noisy_small <- jitter(image_small, amount = 0.1)
image_noisy_small[which(image_noisy_small > 1)] <- 1
image_noisy_small[which(image_noisy_small < 0)] <- 0
# greyscale as the unweighted mean of the three channels
image_noisy_greyscale_small <- (image_noisy_small[, , 1] + image_noisy_small[,
, 2] + image_noisy_small[, , 3])/3
Divide the image into 3x3 submatrices (patches) and do PCA with “position within patch” (top-right, bottom, etc.) as features and each patch as instances:
plot(pca_patch)
summary(pca_patch)
## Importance of components:
## Comp.1 Comp.2 Comp.3 Comp.4
## Standard deviation 0.5442816 0.09822836 0.09044627 0.06012628
## Proportion of Variance 0.9020787 0.02938130 0.02491027 0.01100844
## Cumulative Proportion 0.9020787 0.93146000 0.95637027 0.96737871
## Comp.5 Comp.6 Comp.7 Comp.8
## Standard deviation 0.052716089 0.048900599 0.047877230 0.04101685
## Proportion of Variance 0.008462204 0.007281578 0.006979996 0.00512297
## Cumulative Proportion 0.975840910 0.983122488 0.990102484 0.99522545
## Comp.9
## Standard deviation 0.039597471
## Proportion of Variance 0.004774545
## Cumulative Proportion 1.000000000
pca_patch$loadings
##
## Loadings:
## Comp.1 Comp.2 Comp.3 Comp.4 Comp.5 Comp.6 Comp.7 Comp.8 Comp.9
## 11 0.328 0.514 0.148 0.561 0.435 0.173 0.255
## 21 0.335 0.246 0.368 0.232 0.402 -0.508 0.218 -0.410
## 31 0.332 -0.147 0.542 -0.329 0.364 -0.350 0.151 -0.400 0.170
## 12 0.334 0.395 -0.216 -0.460 -0.309 -0.198 -0.513 -0.277
## 22 0.341 -0.390 -0.441 0.474 0.558
## 32 0.335 -0.393 0.215 -0.452 -0.302 0.206 0.502 -0.311
## 13 0.330 0.146 -0.548 -0.325 0.370 -0.354 -0.134 0.402 0.146
## 23 0.335 -0.244 -0.369 0.225 0.422 0.491 -0.248 -0.398
## 33 0.330 -0.512 -0.142 0.561 -0.435 -0.155 0.270
##
## Comp.1 Comp.2 Comp.3 Comp.4 Comp.5 Comp.6 Comp.7 Comp.8
## SS loadings 1.000 1.000 1.000 1.000 1.000 1.000 1.000 1.000
## Proportion Var 0.111 0.111 0.111 0.111 0.111 0.111 0.111 0.111
## Cumulative Var 0.111 0.222 0.333 0.444 0.556 0.667 0.778 0.889
## Comp.9
## SS loadings 1.000
## Proportion Var 0.111
## Cumulative Var 1.000
We can see that the first principal component covers over 90 percent of the variance in each patch, and is the scaled sum of each pixel within the patch, with a slight bias for the center pixel.
We can see that the first PC is sufficient to meaningfully recreate the image, while the second and third PCs seem to show a bias for certain edges.
It can be seen that the first PC focuses on all pixels - but mostly the center, whereas the second and third PCs focus on the bottom-left and bottom-right corners, respectively.
# Part 1 setup. library() errors loudly when a package is missing,
# unlike require(), which only warns and returns FALSE.
library(data.table)
library(anytime)
# NOTE(review): setwd() makes the script depend on the starting directory;
# prefer absolute paths or a project-root helper.
setwd("./Desktop/okul/Data Mining/data/")
matches_path <- "df9b1196-e3cf-4cc7-9159-f236fe738215_matches.rds"
odd_details_path <- "df9b1196-e3cf-4cc7-9159-f236fe738215_odd_details.rds"
# over/under goal threshold used to derive the isover outcome column
# (assignment switched from `=` to the conventional `<-`)
handicaplvl <- 2.5
matches <- readRDS(matches_path)
# drop unplayed games, split scores, define columns showing
# outcomes (over/under, home win, away win, draw, both-teams-to-score)
# NOTE(review): `:=` modifies `matches` by reference; na.omit() then drops
# rows whose score could not be parsed. isover compares total goals to the
# global handicaplvl threshold.
matches <- na.omit(matches[, `:=`(c("homescore", "awayscore"),
tstrsplit(score, ":"))][, `:=`(c("homescore", "awayscore"),
list(as.numeric(homescore), as.numeric(awayscore)))][, `:=`(c("date",
"leagueId", "type", "score"), NULL)][, `:=`(c("isover", "is1",
"is2", "isX", "isbts"), list(homescore + awayscore > handicaplvl,
homescore > awayscore, homescore < awayscore, homescore ==
awayscore, homescore > 0 & awayscore > 0))])
odd_details <- readRDS(odd_details_path)
# elaborate oddtypes for an easier wide table construction:
# make oddtype unique by folding in betType ("ha"), the handicap level
# ("ou", e.g. over2.5), or both ("ah"); then keep only the latest-dated
# odd per (matchId, bookmaker, oddtype) and drop the redundant columns.
odd_details_latest <- (copy(odd_details)[betType == "ha", `:=`(oddtype,
paste(betType, oddtype, sep = ""))][betType == "ou", `:=`(oddtype,
paste(oddtype, totalhandicap, sep = ""))][betType == "ah",
`:=`(oddtype, paste(betType, oddtype, totalhandicap, sep = ""))][,
`:=`(maxdate, max(date)), by = list(matchId, bookmaker, oddtype)][date ==
maxdate][, `:=`(c("maxdate", "date", "betType", "totalhandicap"),
NULL)])
# put into wide table format, each odd type is a feature
# (one row per matchId x bookmaker, one column per elaborated oddtype)
odd_details_wide <- dcast(copy(odd_details_latest), matchId +
bookmaker ~ oddtype, value.var = "odd")
# remove Asian-Handicap columns (many NAs); copy() protects
# odd_details_wide from the by-reference column deletion
odd_details_wide_noah <- copy(odd_details_wide)[, `:=`(c(grep("ah",
colnames(odd_details_wide), fixed = TRUE)), NULL)]
# NA count per bookmaker for every odd column, plus a row total
# NOTE(review): rowSums over .SD[, 3:length(.SD)] skips .SD column 2 (the
# first odd column) -- confirm this offset is intentional.
na_count_by_bm <- copy(odd_details_wide_noah)[, lapply(.SD, function(x) sum(is.na(x))),
by = bookmaker][, `:=`(total, rowSums(.SD[, 3:length(.SD)]))]
# bookmakers sorted by NA total descending; the tail holds the five
# cleanest bookmakers, which are the ones we keep
bad_bookmakers <- na_count_by_bm[, total, by = bookmaker][order(-total)]
keep_bookmakers <- bad_bookmakers$bookmaker[(length(bad_bookmakers$bookmaker) -
4):length(bad_bookmakers$bookmaker)]
odd_details_wide_noah <- odd_details_wide_noah[bookmaker %in%
keep_bookmakers]
# Recursively drops, per round, the odd-type column(s) contributing the most
# NAs for some bookmaker, until either few odd columns remain or the last
# round's saving was small relative to what deleting a match row could save.
# wide_table: data.table with matchId, bookmaker and one column per odd type.
# kgf: "keep going factor" carried over from the previous round (the initial
#      call passes a large value so at least one round runs).
# Returns the filtered wide table.
filter.oddtypes <- function(wide_table, kgf) {
    # FIX(review): `||` (scalar, short-circuiting) instead of `|`; both
    # operands here are length-1 conditions.
    if ((length(wide_table) < 11) || kgf < 20) {
        return(wide_table)
    }
    # NA count per bookmaker for each odd column, plus a per-row total
    na_count <- copy(wide_table)[, lapply(.SD, function(x) sum(is.na(x))),
        by = bookmaker][, `:=`(total, rowSums(.SD[, 3:length(.SD)]))]
    sum1 <- sum(na_count$total)
    # for each bookmaker, the column holding the most NAs (first on ties);
    # unique() collapses these into the set of columns to drop this round
    bad_oddtypes <- unique(na_count[, colnames(.SD[, 3:(length(.SD) -
        1)])[max.col(na_count[, 3:(length(.SD) - 1)], ties.method = "first")]])
    print(bad_oddtypes)
    # recompute totals without the dropped columns to measure the saving
    na_count <- na_count[, `:=`(c(bad_oddtypes), NULL)][, `:=`(total,
        rowSums(.SD[, 3:(length(.SD) - 1)]))]
    sum2 <- sum(na_count$total)
    removed <- sum1 - sum2
    # upper bound on NAs removable by instead deleting one match's rows
    rowremposs <- max(wide_table[, .N, by = matchId]$N) * (length(wide_table) -
        2)
    print(paste("The removed number of NA entries is", removed,
        "deleting a match instead could remove a maximum of",
        rowremposs, "NA values."))
    keepgoingfactor <- removed/rowremposs
    wide_table <- wide_table[, `:=`(c(bad_oddtypes), NULL)]
    return(filter.oddtypes(wide_table, keepgoingfactor))
}
# initial kgf of 100 guarantees the first filtering round runs
odd_details_wide_filtered <- filter.oddtypes(copy(odd_details_wide_noah),
    100)
# Run PCA on a single bookmaker's odds and plot the first two components,
# colouring points by the logical outcome column named in `condition`.
# wide_filtered_table: matchId + bookmaker + one column per odd type.
# match.tbl: matches table containing the outcome columns (isover, is1, ...).
# bmname: bookmaker to analyse; condition: outcome column name (string).
# suppress: if TRUE, skip the scree plot and printed summaries.
# Returns list(scores merged with the outcome column, the princomp object).
pca.analysis <- function(wide_filtered_table, match.tbl, bmname,
condition, suppress = FALSE) {
# one row per match for this bookmaker; drop rows with any NA odds
df <- as.data.frame(na.omit(copy(wide_filtered_table)[bookmaker ==
bmname][, `:=`(bookmaker, NULL)]))
# scale each odd column by its maximum, without centering
df[, -1] <- lapply(df[, -1], function(x) scale(x, center = FALSE,
scale = max(x)))
rownames(df) <- df[["matchId"]]
pca <- princomp(df[, -1])
if (!suppress) {
plot(pca, main = paste("PCA for", bmname))
print(paste("PCA for", bmname))
print(summary(pca))
print(pca$loadings)
}
# carry matchIds alongside the PC scores, then join the outcome column
# (`..condition` looks the column name up in the calling scope)
mscores <- as.data.frame(pca$scores)
mscores[["matchId"]] <- rownames(mscores)
pca_m <- unique(merge(mscores, match.tbl[, c("matchId", ..condition)],
by = "matchId"))
plot(pca_m[["Comp.1"]], pca_m[["Comp.2"]], col = ifelse(pca_m[[condition]],
"red", "black"), xlab = "Comp. 1", ylab = "Comp. 2",
main = paste("PCA shifted data for", bmname, "and", condition))
legend("bottomright", c(condition, paste("NOT", condition)),
fill = c("red", "black"), cex = 0.75)
return(list(pca_m, pca))
}
# Classical (metric) MDS on a single bookmaker's odds, plotted in 2D with
# points coloured by the logical outcome column named in `condition`.
# mthd: distance metric passed to dist(), e.g. "euclidean" or "manhattan".
# Other arguments are as in pca.analysis(). Called for its plot side effect.
do.MDS <- function(wide_filtered_table, match.tbl, mthd, bmname,
condition) {
# one row per match for this bookmaker; drop rows with any NA odds
df <- as.data.frame(na.omit(copy(wide_filtered_table)[bookmaker ==
bmname][, `:=`(bookmaker, NULL)]))
# scale each odd column by its maximum, without centering
df[, -1] <- lapply(df[, -1], function(x) scale(x, center = FALSE,
scale = max(x)))
rownames(df) <- df[["matchId"]]
d <- dist(df[, -1], method = mthd)
# cmdscale() returns 2 coordinates per match (columns V1, V2)
mds <- as.data.frame(cmdscale(d))
mds[["matchId"]] <- rownames(mds)
mds_m <- unique(merge(mds, match.tbl[, c("matchId", ..condition)],
by = "matchId"))
plot(mds_m[["V1"]], mds_m[["V2"]], main = paste(bmname, mthd,
"Distance MDS for", condition), xlab = "Dim1", ylab = "Dim2",
col = ifelse(mds_m[[condition]], "red", "black"))
legend("bottomright", c(condition, paste("NOT", condition)),
fill = c("red", "black"), cex = 0.75)
invisible(NULL)
}
# Run the PCA analysis for every (outcome, bookmaker) pair, then the MDS
# plots. assign() recreates the exact result-variable names the original
# script used (pca<abbr>_<suffix>), in the same call order as before.
bookmaker_names <- c("1xBet", "ComeOn", "youwin", "Betfair",
    "Betfair Exchange")
bookmaker_abbrs <- c("1xB", "com", "you", "Bet", "BEx")
condition_names <- c("isover", "is1", "is2", "isX")
condition_abbrs <- c("ou", "1", "2", "X")
for (ci in seq_along(condition_names)) {
    for (bi in seq_along(bookmaker_names)) {
        assign(paste0("pca", bookmaker_abbrs[bi], "_", condition_abbrs[ci]),
            pca.analysis(odd_details_wide_filtered, matches,
                bookmaker_names[bi], condition_names[ci]))
    }
}
# Euclidean MDS for all bookmakers, then Manhattan, all on isover.
for (metric in c("euclidean", "manhattan")) {
    for (bm in bookmaker_names) {
        do.MDS(odd_details_wide_filtered, matches, metric, bm,
            "isover")
    }
}
# Part 2 setup: image PCA. library() errors loudly when a package is
# missing, unlike require().
library(jpeg)
library(data.table)
# NOTE(review): setwd() makes this chunk depend on the starting directory;
# it assumes a fresh session (the Part 1 chunk also changed directory).
setwd("./Desktop/okul/Data Mining/data/")
# NOTE(review): `image` shadows graphics::image(); calls such as
# image(m, ...) still resolve to the function, but a name like `img`
# would be clearer.
image <- readJPEG("RGBclumsy.jpg")
str(image)
dim(image)
# preview the full image and each RGB channel
plot(c(0, 1), c(0, 1), type = "n", xlab = "", ylab = "")
rasterImage(image, 0, 0, 1, 1)
rasterImage(image = image[, , 1], 0, 0, 1, 1)
rasterImage(image = image[, , 2], 0, 0, 1, 1)
rasterImage(image = image[, , 3], 0, 0, 1, 1)
# clamp intensities into [0, 1] so rasterImage() accepts them
# (replaces the clamping code that was duplicated three times)
clamp01 <- function(x) pmin(pmax(x, 0), 1)
# add uniform noise in [-0.1, 0.1], then clamp
image_noisy <- clamp01(jitter(image, amount = 0.1))
plot(c(0, 1), c(0, 1), type = "n", xlab = "", ylab = "")
rasterImage(image_noisy, 0, 0, 1, 1)
rasterImage(image = image_noisy[, , 1], 0, 0, 1, 1)
rasterImage(image = image_noisy[, , 2], 0, 0, 1, 1)
rasterImage(image = image_noisy[, , 3], 0, 0, 1, 1)
# greyscale as the unweighted mean of the three channels
image_noisy_greyscale <- (image_noisy[, , 1] + image_noisy[,
    , 2] + image_noisy[, , 3])/3
plot(c(0, 1), c(0, 1), type = "n", xlab = "", ylab = "")
rasterImage(image_noisy_greyscale, 0, 0, 1, 1)
# smaller copy of the image used for the patch-level PCA below
image_small <- readJPEG("RGBclumsysmall.jpg")
str(image_small)
dim(image_small)
image_noisy_small <- clamp01(jitter(image_small, amount = 0.1))
image_noisy_greyscale_small <- (image_noisy_small[, , 1] + image_noisy_small[,
    , 2] + image_noisy_small[, , 3])/3
# 3x3 submatrices
patchdim <- 3
# gap = pixels on each side of the patch centre (1 for a 3x3 patch)
gap <- ((patchdim - 1)/2)
# one column per interior pixel; each column holds that pixel's patch,
# flattened column-wise
# NOTE(review): assumes a square image -- ncol() is used for both
# dimensions of the patch count.
patches <- matrix(nrow = patchdim^2, ncol = ((ncol(image_noisy_greyscale_small) -
(2 * gap))^2))
colnames(patches) <- rep("noname", times = ncol(patches))
k <- 1
kmax <- ncol(patches)
for (i in (gap + 1):(nrow(image_noisy_greyscale_small) - gap)) {
for (j in (gap + 1):(ncol(image_noisy_greyscale_small) -
gap)) {
patch <- image_noisy_greyscale_small[(i - gap):(i + gap),
(j - gap):(j + gap)]
patches[, k] <- c(patch)
# column name records the centre-pixel position as "i j"
colnames(patches)[k] <- paste(i, j)
k <- k + 1
# progress countdown; prints once per pixel (noisy -- consider removing)
print(kmax - k)
}
}
# transpose: rows = patches (instances), columns = position within patch
# (features), named by their row-column position inside the 3x3 patch
patches_t <- t(patches)
colnames(patches_t) <- c("11", "21", "31", "12", "22", "32",
"13", "23", "33")
pca_patch <- princomp(patches_t)
plot(pca_patch)
summary(pca_patch)
pca_patch$loadings
# map each patch's PC scores back to its centre-pixel coordinates,
# recovered by splitting the "i j" rownames into numeric xpos/ypos
mapping <- (as.data.table(pca_patch$scores)[, `:=`(pos, rownames(pca_patch$scores))][,
`:=`(c("xpos", "ypos"), tstrsplit(pos, " "))][, `:=`(c("xpos",
"ypos"), list(as.numeric(xpos), as.numeric(ypos)))][, `:=`(pos,
NULL)])
# Rebuild an image from a single principal-component score per patch.
# comp: score column in `mapping`, e.g. "Comp.1".
# Returns a matrix clamped to [0, 1], sized like the interior of the
# greyscale image (borders of width `gap` are lost to patch extraction).
# FIX(review): replaces three copy-pasted loops; the hard-coded `+ 1`
# offsets are generalised to `gap`, and the per-row print() progress spam
# is dropped. The per-pixel data.table filter is still slow -- keying
# `mapping` on (xpos, ypos) would speed it up if needed.
reconstruct_component <- function(comp) {
    side <- ncol(image_noisy_greyscale_small) - (2 * gap)
    img <- matrix(nrow = side, ncol = side)
    for (i in seq_len(side)) {
        for (j in seq_len(side)) {
            # patch centres start at gap + 1, hence the offset
            img[i, j] <- mapping[(xpos == (i + gap)) & (ypos ==
                (j + gap))][[comp]]
        }
    }
    # clamp intensities so rasterImage() accepts them
    img[img > 1] <- 1
    img[img < 0] <- 0
    img
}
# reconstruction from the first three components, plotted one by one
new_img1 <- reconstruct_component("Comp.1")
plot(c(0, 1), c(0, 1), type = "n", xlab = "", ylab = "")
rasterImage(new_img1, 0, 0, 1, 1)
new_img2 <- reconstruct_component("Comp.2")
plot(c(0, 1), c(0, 1), type = "n", xlab = "", ylab = "")
rasterImage(new_img2, 0, 0, 1, 1)
new_img3 <- reconstruct_component("Comp.3")
plot(c(0, 1), c(0, 1), type = "n", xlab = "", ylab = "")
rasterImage(new_img3, 0, 0, 1, 1)
# Visualize the first three PC loadings as 3x3 greyscale patches.
# assign() keeps the small_img1/2/3 variables the original script created.
for (pc in 1:3) {
    loading_img <- matrix(pca_patch$loadings[, pc], 3, 3)
    # clamp to the [0, 1] range expected by the grey palette
    loading_img[loading_img > 1] <- 1
    loading_img[loading_img < 0] <- 0
    assign(paste0("small_img", pc), loading_img)
    image(loading_img, col = gray((0:255)/255))
}